********************************************************************************
* 
* Creating variables in CPS data, imposing sample restrictions, creating instruments
/* 
		Part 1 - merging CPS data to mci_uniq_id9 
		Part 2 - creating variables 
		Part 3 - sample restrictions 
		Part 4 - data cleaning so that only one observation per referral_ID-mci_uniq_id9 
		Part 5 - creating instruments 
*/   
********************************************************************************

clear all 


********** Part 1 - Merging in IDs to CPS data 
	
import excel using "$data\Raw\allegheny_raw_25nov19\MCI matching file Updated.xlsx", firstrow case(l) clear sheet("Sheet 1") allstring 

destring mci_uniq_id mci_id client_id, replace 
merge m:1 mci_uniq_id mci_id client_id using "$data\Processed\IDmasterfile", gen(_mdup)

preserve 
replace mci_uniq_id9=mci_uniq_id if mci_uniq_id9==. 
keep client_id mci_uniq_id9 _mdup
duplicates drop 
duplicates tag client_id, gen(tag)
tab tag //96 cases (60 without replacing mci_uniq_id9 that were missing) where a given client_id is matched to two mci_uniq_id9's. Minor enough of an issue given the sample size, that I will just keep the highest mci_uniq_id9 prior to replacing missings, for simplicity's sake 
bys client_id (_mdup mci_uniq_id9): keep if _n==_N 
count if mci_uniq_id9==. 
drop tag _mdup
tempfile ids1 
save "`ids1'"
restore 

use "$data\Processed\ReferralClient.dta", clear 
drop if cl_id==. & mci_id==. //21 observations deleted 

*adding in information on who is a client_id following a TPR or an unknown person 
rename cl_id client_id 
merge m:1 client_id using "$data\processed\IDtprORunk"
drop if _merge==2 
drop _merge 

*merging in mci_uniq_id9 using client_id 
merge m:1 client_id using "`ids1'"
drop if _merge==2 
replace mci_id=. if mci_id==0
gen notinMCImatchingfile=(_merge==1)
drop _merge 


********** Part 2 - Generating variables as needed 

*clean date variable 
split dob, parse(" ")
gen dob_mdy=date(dob1,"DMY")
format dob_mdy %td
drop dob1 dob2 dob 

* from code #1 

*creating new age variables and DVs for children 
gen age_cleaned=round((intake_dt_mdy-dob_mdy)/365,0.1)
gen minor=age_cleaned<18 if age_cleaned<. 
tab minor all_child, m 
replace age_cleaned=. if age_cleaned<0 

*Creating variables as needed 
gen female=(gender=="Female") if gender!="" & gender!="Unknown"
gen white=(race_group=="White") if (race_group!=""&race_group!="Unable to Determine"&race_group!="Other/Unknown")
gen hispanic=(ethncty_type!="Not Hispanic") if !(ethncty_type==""|ethncty_type=="Unable to determine" | ethncty_type=="Did Not Ask"|ethncty_type=="Declined to Answer")

*Cleaning demographics 
foreach x of varlist race_group ethncty_type gender {
tab `x'
gen `x'_cleaned=`x'
}
replace gender_cleaned="" if gender=="Unknown" 
replace ethncty_type_cleaned="" if ethncty_type=="Declined to Answer" | ethncty_type=="Did Not Ask" | ethncty_type=="Unable to determine" 
replace race_group_cleaned="" if race_group=="Unable to Determine" | race_group=="Other/Unknown" 


*year FE 
tab intake_dt_y, gen(year)

*Regional offices dummies. I initially had used actual handling regional office, but since those are associated with the last caseworker assigned, they are not not necessarily the ones from whom the investigator was from. Hence the new set of regional dummies 
foreach i in 1 N _ {
foreach x in CRO ERO INTAKE MVRO NRO SRO {
gen `x'_inv`i'=(regional_office_inv`i'=="`x'")
}
}


gen byte mom= (parent==1 & gender_cleaned=="Female" )
gen byte dad= (parent==1 & gender_cleaned=="Male"  ) 

sort mci_uniq_id9 refer_id intake_dt_mdy 
by mci_uniq_id9: gen referralnbinsample=_n 
tab referralnbinsample
sort refer_id 
by refer_id: egen hadremovedchild3m=max(within3m)
by refer_id: egen hadremovedchild1y=max(within1y)
gen anyremoved3m = hadremovedchild3m>0 &hadremovedchild3m<. 
gen anyremoved1y=hadremovedchild1y>0 &hadremovedchild1y<. 



gen v1=strpos(race, "Black")
gen black=v1!=0 if race!="" & race_group!="Unable to Determine" 
gen black2=black 
replace black2=0 if black==. 
gen missingrace=white==. 
gen white2=white 
replace white2=0 if white==. 
gen missingethn=hispanic==. 
gen hispanic2=hispanic 
replace hispanic2=0 if hispanic==. 


forval x=1/22 { 
	local name: variable label allegation`x'
	local allegationname=substr("`name'",19,.)
	gen listedallegation`x'=allegation`x'>0 if allegation`x'<. 
	gen validallegation`x'=allegation`x'>2 if allegation`x'<. 
	label var listedallegation`x' "Listed `allegationname'"
	label var validallegation`x' "Indicated or Valid `allegationname'"
}
egen personreferral=group(mci_uniq_id9 refer_id) //FEs will be person-referral specific 
//encode regional_office_inv$invtype, gen(regional_office_inv$invtypeen)


gen openonly=accept==1&anyremoved3m==0 
gen openonly1y=accept==1&anyremoved1y==0 

*labels 
label var mandated "Mandated Reporter"
label var white2 "White"
label var hispanic2 "Hispanic"
label var missingrace "Race Missing"
label var missingethn "Ethnicity Missing"
label var female "Women"
label var age_cleaned "Age"




*************** Part 3 - Sample restrictions to reduce cleaning that is needed 

* Dropping older data because not reliable 
drop if intake_dt_my<tm(2008m9) //neither referral nor acceptance are realiable yet 
drop if intake_dt_my<tm(2009m5) //removals are not comprehensive yet. I only loose a handful of supervisors



**************** Part 4 - Dropping a few duplicates reported more than once on referral - Manually going through them

drop refer_cl
drop addrs_line1_txt addrs_line2_txt city_nme  zip_nbr_txt
duplicates tag mci_uniq_id9 refer_id, gen(dup)
tab dup 
*how many of these are just due to the id variables and variables I generated being different?
	preserve 
	drop client_id mci_id referralnbinsample  
	duplicates drop 
	drop dup 
	duplicates tag mci_uniq_id9 refer_id, gen(dup)
	tab dup 
	restore 
//small enough number of cases that it seems reasonable to arbitrarily choose one obs from each 
bys refer_id mci_uniq_id9 (parent aprp race_group_cleaned dob_mdy client_id): keep if _n==_N
drop dup 
duplicates report mci_uniq_id9 refer_id //all good! 




**************** Part 5 - Creating instruments 


**** Constructing the Investigator propensities 

*Constructing at the referral level
preserve 
	keep if notes_INV=="Choice between Several Investigators" | notes_INV=="First 1-day assignment" | notes_INV=="One Investigator Assignment"
	bys refer_id: keep if _n==1 
	local invtype N 
	bys regional_office_inv`invtype' caseworker_id_inv`invtype': gen nbreferrals_inv`invtype'=_N 
	egen FE_oy=group(regional_office_inv`invtype' intake_dt_y)
foreach x in accept anyremoved3m openonly openonly1y anyremoved1y { 
	bys regional_office_inv`invtype' caseworker_id_inv`invtype': egen num=total(`x')
	gen prop`x'`invtype' = (num-`x')/(nbreferrals_inv`invtype'-1)
	reg prop`x'`invtype' i.FE_oy 
	predict p, xb 
	gen prop`x'res`invtype'=prop`x'`invtype'-p 
	drop num p 
	summ prop`x'res`invtype' 
	gen prop`x'resSD`invtype'=prop`x'res`invtype'/`r(sd)'
	}
	drop FE_oy
keep refer_id prop*accept* prop*openonly* prop*anyremoved3m* nbreferrals_inv*
tempfile newiv 
save "`newiv'"
restore 

merge m:1 refer_id using "`newiv'"
tab _merge screenin 
drop _merge 


* First referral 
bys client_id (refer_id): gen firstref=_n==1 
label var firstref "First referral in sample"

* In the 3m prior to election 
gen election_3m=0 
replace election_3m=1 if intake_dt_mdy<=td(4Nov2008) & intake_dt_mdy>td(4Aug2008)
replace election_3m=1 if intake_dt_mdy<=td(2Nov2010) & intake_dt_mdy>td(2Aug2010)
replace election_3m=1 if intake_dt_mdy<=td(6Nov2012) & intake_dt_mdy>td(6Aug2012)
replace election_3m=1 if intake_dt_mdy<=td(4Nov2014) & intake_dt_mdy>td(4Aug2014)
replace election_3m=1 if intake_dt_mdy<=td(8Nov2016) & intake_dt_mdy>td(8Aug2016)
replace election_3m=1 if intake_dt_mdy<=td(6Nov2018) & intake_dt_mdy>td(6Aug2018)
replace election_3m=1 if intake_dt_mdy<=td(3Nov2020) & intake_dt_mdy>td(3Aug2020)
label var election_3m "Referral within 3months prior to election"





save "$data\Processed\CPSdata", replace 



